/*==============================================================================
FILE NAME: Process_Air_Investigations_Onsite_Flag.do
INPUTS: Raw data/TCEQ Workplan Definitions_July2022.xlsx
OUTPUTS: ActivityCodes_clean.dta, air_investigations_activity_codes.dta, IN_onsite.dta
UPDATED: 29 July 2025
==============================================================================*/

/* Set directory if working independently through code
if c(username)=="" { //insert username
	global rootdir "" // insert root path
	global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 
*/

// Import raw data
import excel using "$raw_data/TCEQ Workplan Definitions_July2022.xlsx", first clear

// Standardize and clean Media column, keep only AIR or ALL
replace Media=upper(Media)
replace Media=trim(Media)
keep if Media=="AIR" | Media=="ALL"

// Rename and clean activity code column, standardize PrimarilyOnSite
rename WorkplanActivity ActivityCode //rename variable for activtiy code
replace ActivityCode=upper(ActivityCode)
replace PrimarilyOnSite=upper(PrimarilyOnSite)
replace PrimarilyOnSite=trim(PrimarilyOnSite)

// Create onsite_inv flag: 1 if activity is primarily on-site, 0 otherwise
gen onsite_inv=.
replace onsite_inv=0 if PrimarilyOnSite=="NO"
replace onsite_inv=1 if PrimarilyOnSite=="YES"
label var onsite_inv "=1 if inv activity code is primarily on-site"
drop PrimarilyOnSite

// Standardize Program column, sort and save cleaned activity codes
replace Program=upper(Program)
replace Program=trim(Program)
sort ActivityCode
order ActivityCode, first
save "$processed_data/ActivityCodes_clean.dta", replace


use "$processed_data/investigations.dta", clear
rename RegulatedEntityNo RN 
rename InvestigationNo IN
keep if Media=="AIR"
keep IN ActivityCode InvestigationStartDate
duplicates drop
sort IN
save "$processed_data/air_investigations_activity_codes.dta", replace

// Python script below splits activity codes, merges onsite flags, and outputs CSV for Stata import
// Steps: Import activity codes, split by comma, clean, merge onsite flags, output CSV

python:
import pandas as pd
import os
import sfi

stata_path = sfi.Macro.getGlobal("processed_data")

print(f"Path from Stata global: {stata_path}")

os.chdir(rf"{stata_path}")

data = pd.read_stata("air_investigations_activity_codes.dta")

Codes_Split = data['ActivityCode'].str.split(',', n=-1, expand=True)

Codes_Split.rename({0: "Code1", 1:"Code2", 2:"Code3", 3:"Code4", 4:"Code5", \
                    5:"Code6", 6:"Code7", 7:"Code8"}, axis = 1, inplace = True)

for x in range(8):
    col = f"Code{x+1}"
    Codes_Split[col] = Codes_Split[col].str.lstrip()
    
Codes_separated = pd.concat([data, Codes_Split], axis = 1)

df = pd.read_stata("ActivityCodes_clean.dta")
onsite_key = df.drop("Media", axis = 1)
onsite_key.drop("Program", axis  = 1, inplace= True)
onsite_key.drop("CCEDSActivityName", axis  = 1, inplace= True)
onsite_key.drop("Definition", axis  = 1, inplace= True)

for s in range(8):
    y = "Code" + str(s+1)
    z = "onvsite_inv" + str(s+1)
    onsite_key.rename(columns={"ActivityCode": y}, inplace = True)
    onsite_key.rename(columns={"onsite_inv": z}, inplace = True)
    Codes_separated = Codes_separated.merge(onsite_key, on = y, how = "left")
    onsite_key.rename(columns={y:"ActivityCode"}, inplace = True)
    onsite_key.rename(columns={z:"onsite_inv"}, inplace = True)

Codes_separated.to_csv("air_investigations_on_site.csv", index=False)
end 
//end of python code

//Import python-created dataset to create IN_onsite
import delimited "$processed_data/air_investigations_on_site.csv", clear
rename v1 IN
save "$processed_data/air_investigations_on_site.dta", replace
egen onsite=rmax(onvsite_inv1 onvsite_inv2 onvsite_inv3 onvsite_inv4 onvsite_inv5 onvsite_inv6 onvsite_inv7 onvsite_inv8)
label var onsite "=1 if any actvitiy codes linked to IN are mostly onsite; =0 if all non-missing values are zeroes"
order onsite, after(IN)
isid IN
drop investigationstartdate activitycode onvsite_inv1-onvsite_inv8
sort IN
save "$processed_data/IN_onsite.dta", replace